Show the code
import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)
For Project 1, the answer to each question should include a chart and a written response. The year labels on your charts should not include a comma. At least two of your charts must include reference marks.
# Learn more about Code Cells: https://quarto.org/docs/reference/cells/cells-jupyter.html
# Include and execute your code here
df = pd.read_csv("https://github.com/byuidatascience/data4names/raw/master/data-raw/names_year/names_year.csv")
How does your name at your birth year compare to its use historically?
In my birth year 1967, there were 48,900 people named Michael.
The name was used the most in 1954 and has mostly been on a decline since.
In 1990, there was a brief resurgence, where the name was used 44,043 times.
# Name under study and the birth year to highlight on the chart
name = "Michael"
birth_year = 1967

# Every row for the chosen name, and the subset that falls in the birth year
df_name = df[df["name"] == name]
df_name_birth_year = df_name[df_name["year"] == birth_year]
has_birth_year = not df_name_birth_year.empty

# Usage count in the birth year; 0 when the name has no row for that year
if has_birth_year:
    name_birth_year_usage = df_name_birth_year["Total"].values[0]
else:
    name_birth_year_usage = 0

# Year-ordered history with explicit column types (integer year, float count)
column_types = {"year": int, "Total": float}
historical_usage = (
    df_name[["year", "Total"]]
    .sort_values(by="year")
    .astype(column_types)
)
historical_usage_dict = historical_usage.to_dict(orient="list")

# Same two typed columns for the birth-year point, or empty lists when the
# birth year is missing so the highlight layer below can be skipped
if has_birth_year:
    df_name_birth_year = df_name_birth_year[["year", "Total"]].astype(column_types)
    df_name_birth_year_dict = df_name_birth_year.to_dict(orient="list")
else:
    df_name_birth_year_dict = {"year": [], "Total": []}

# The x-axis spans exactly the years present for this name
min_year = df_name["year"].min()
max_year = df_name["year"].max()
year_axis = scale_x_continuous(
    format=".0f",  # year labels without a thousands separator
    limits=(min_year, max_year),
    breaks=list(range(min_year, max_year + 1, 10)),  # one tick per decade
)

# Base line chart of the name's usage over time
p = ggplot(historical_usage_dict, aes(x="year", y="Total"))
p += geom_line(color="blue", size=1)
p += ggtitle(f"Usage of the name '{name}' over the years (Including {birth_year})")
p += xlab("Year")
p += ylab("Number of Occurrences")
p += year_axis
p += theme_minimal()
p += ggsize(900, 540)

# Reference mark: a red point at the birth year, drawn only when data exists
if df_name_birth_year_dict["year"]:
    birth_year_tooltips = (
        layer_tooltips()
        .line("Year|@year")
        .line("Usage|@Total")
    )
    p += geom_point(
        data=df_name_birth_year_dict,
        mapping=aes(x="year", y="Total"),
        color="red",
        size=5,
        tooltips=birth_year_tooltips,
    )
# Render the plot
p.show()
If you talked to someone named Brittany on the phone, what is your guess of his or her age? What ages would you not guess?
By plotting the distribution of when Brittany was born, we can compare it to the current year.
Someone named Brittany is most likely around 35 years old in 2025.
# Name to analyse; replace with any name you want to search for
name = "Brittany"
# Reference year used to turn a birth year into an age
current_year = 2025

# Year-ordered usage history for the chosen name
df_name = df[df["name"] == name]
historical_usage = df_name[["year", "Total"]].sort_values(by="year").copy()
if historical_usage.empty:
    # Fail with a clear message instead of an obscure idxmax() error below
    raise ValueError(f"No records found for the name '{name}'")

# Peak usage and the age it implies. The year is read with a scalar lookup:
# selecting the whole row first would return a float Series and turn the
# year into e.g. 1990.0.
peak_index = historical_usage["Total"].idxmax()
peak_usage_year = int(historical_usage.loc[peak_index, "year"])
peak_usage = historical_usage["Total"].max()
guessable_age = current_year - peak_usage_year

# Years whose usage is strictly below the 10th percentile for this name
low_usage_cutoff = historical_usage["Total"].quantile(0.1)
min_usage_years = historical_usage[historical_usage["Total"] < low_usage_cutoff]

# Convert data to dictionaries for Lets-Plot
historical_usage_dict = historical_usage.to_dict(orient="list")
peak_usage_dict = {"year": [peak_usage_year], "Total": [peak_usage]}
low_usage_dict = min_usage_years.to_dict(orient="list")

# Tooltip shared by both point layers; the year is formatted as a plain
# integer so it carries no thousands separator or decimals
point_tooltips = (
    layer_tooltips()
    .format("@year", "d")
    .line("Year|@year")
    .line("Usage|@Total")
)

# Base line chart of the name's usage over time
p = (
    ggplot(historical_usage_dict, aes(x="year", y="Total"))
    + geom_line(color="blue", size=1)
    + ggtitle(f"Usage of the name '{name}' over the years")
    + xlab("Year")
    + ylab("Number of Occurrences")
    + scale_x_continuous(format=".0f")  # year labels without commas
    + theme_minimal()
    + ggsize(900, 540)
)

# Reference mark: red point at the peak usage year
p += geom_point(
    data=peak_usage_dict,
    mapping=aes(x="year", y="Total"),
    color="red",
    size=5,
    tooltips=point_tooltips,
)

# Reference marks: green points on the lowest-usage years
p += geom_point(
    data=low_usage_dict,
    mapping=aes(x="year", y="Total"),
    color="green",
    size=6,
    shape=16,
    alpha=0.8,  # slightly transparent so the line stays visible
    tooltips=point_tooltips,
)

# Render the plot
p.show()

# Output answers to the questions
low_usage_year_list = ", ".join(min_usage_years["year"].astype(str))
print("")
print(f"Guessing the age of someone named {name}:")
print(
    f"The peak year for the name {name} was {peak_usage_year},\n"
    f"which means someone named {name} is most likely around "
    f"{guessable_age} years old in {current_year}."
)
print(
    f"Ages you would not guess for {name} are likely in the years\n"
    f"{low_usage_year_list} (years highlighted in green dots).\n"
    f"These years have very low usage of the name."
)
print("")
Guessing the age of someone named Brittany:
The peak year for the name Brittany was 1990,
which means someone named Brittany is most likely around 35 years old in 2025.
Ages you would not guess for Brittany are likely in the years
1968, 1969, 1970, 1971, 1972 (years highlighted in green dots).
These years have very low usage of the name.
Mary, Martha, Peter, and Paul are all Christian names. From 1920 - 2000, compare the name usage of each of the four names in a single chart. What trends do you notice?
I need to add some amazing analysis here
Mary was significantly more popular than the other three names from 1920 to 1950.
By 2000, none of the four names is especially popular.
# The four names whose usage is compared on one chart
names_to_compare = ["Mary", "Martha", "Peter", "Paul"]

# Keep only rows for those names within 1920-2000 (both ends inclusive)
is_selected_name = df["name"].isin(names_to_compare)
is_in_period = df["year"].between(1920, 2000)
df_filtered = df[is_selected_name & is_in_period]

# Lets-Plot is given the data as a dictionary of column lists
df_filtered_dict = df_filtered.to_dict(orient="list")

# One line per name; mapping 'name' to color separates the series
p = ggplot(df_filtered_dict, aes(x="year", y="Total", color="name"))
p += geom_line(size=1)
p += ggtitle("Usage of Christian Names (Mary, Martha, Peter, Paul) from 1920 to 2000")
p += xlab("Year")
p += ylab("Number of Occurrences")
p += scale_x_continuous(format=".0f")  # year labels without commas
p += theme_minimal()
p += ggsize(900, 540)
# Render the plot
p.show()
Think of a unique name from a famous movie. Plot the usage of that name and see how changes line up with the movie release. Does it look like the movie had an effect on usage?
The name I selected was Forrest, from the movie “Forrest Gump”, which was released on July 6, 1994.
Usage of the name Forrest peaked around the time the movie came out in 1994.
This shows the influence a movie can have on the names people choose for their children.
# Name to analyse and the release date of the movie it is associated with
name = "Forrest"
movie_release_date = "1994-07-06"

# All rows for the chosen name, as a dictionary of column lists for Lets-Plot
df_name = df[df["name"] == name]
df_name_dict = df_name.to_dict(orient="list")

# Only the year part of the release date is needed for the x-axis
movie_release_year = int(movie_release_date.split("-")[0])

# The x-axis spans exactly the years present for this name
min_year = df_name["year"].min()
max_year = df_name["year"].max()

# One-row data set for the text annotation next to the release line.
# Constant `label=` arguments on geom_line/geom_vline do not produce a label,
# so the annotation is drawn as its own text layer instead.
release_label_dict = {
    "year": [movie_release_year + 0.5],  # just right of the dashed line
    "Total": [df_name["Total"].max()],   # at the top of the plotted range
    "label": [f"Movie Release ({movie_release_year})"],
}

# Base line chart of the name's usage over time
p = (
    ggplot(df_name_dict, aes(x="year", y="Total"))
    + geom_line(color="blue", size=1)
    + ggtitle(f"Usage of the name '{name}' over the years (Including {movie_release_year})")
    + xlab("Year")
    + ylab("Number of Occurrences")
    # "d" prints years without commas; expand sets the padding added around
    # the x limits as (multiplicative, additive)
    + scale_x_continuous(format="d", limits=(min_year, max_year), expand=(0, 0.05))
    + theme_minimal()
    + ggsize(900, 540)
)

# Reference mark: dashed vertical line at the movie release year, with a label
p += geom_vline(xintercept=movie_release_year, color="red", linetype="dashed", size=1)
p += geom_text(
    aes(x="year", y="Total", label="label"),
    data=release_label_dict,
    color="red",
    hjust=0,
    vjust=1,
)

# Add margins around axis titles and tick labels to avoid overlap
p += theme(
    axis_title_x=element_text(margin=15),
    axis_title_y=element_text(margin=15),
    axis_text_x=element_text(margin=10),
    axis_text_y=element_text(margin=10),
    plot_margin=15,
)
# Render the plot
p.show()
Reproduce the chart Elliot using the data from the names_year.csv file.
I need to add some amazing analysis here.
The name being analyzed is Elliot.
There were three releases of E.T.: 1982, 1985, and 2002.
We can plot the name's usage over time to see the impact of each release.
# Name to analyse and the E.T. release dates, each with its annotation text
name = "Elliot"
movie_release_dates = [
    ("1982-06-11", "E.T Released"),
    ("1985-07-19", "Second Release"),
    ("2002-03-22", "Third Release"),
]

# Rows for the chosen name from 1950 onward, as a dictionary for Lets-Plot
df_name = df[(df["name"] == name) & (df["year"] >= 1950)]
df_name_dict = df_name.to_dict(orient="list")

# Fixed x-axis range for the chart
min_year = 1950
max_year = 2020

# Constant 'group' column so the line can be mapped to color and get a legend
df_name_dict["group"] = [f"{name}"] * len(df_name_dict["year"])

p = (
    ggplot(df_name_dict, aes(x="year", y="Total"))
    + geom_line(aes(color="group"), size=1, show_legend=True)
    + scale_color_manual(values=["blue"])  # the single series is drawn in blue
    + labs(color="name")  # legend title reads "name"
    + ggtitle("Elliot... What?")
    + xlab("year")
    + ylab("Total")
    + scale_x_continuous(
        format=".0f",  # year labels without commas
        limits=(min_year, max_year),
        breaks=list(range(min_year, max_year + 1, 10)),  # tick every 10 years
    )
    + scale_y_continuous(
        limits=(0, 1200),
        breaks=list(range(0, 1201, 200)),  # tick every 200
        format=".0f",  # count labels without commas
    )
    + theme_minimal()
    + theme(
        panel_background=element_rect(fill="lightblue", color=None),
        legend_background=element_rect(fill="white", color=None),
        legend_key=element_rect(fill=None, color=None),
    )
    + ggsize(900, 540)
)

# Vertical position of the annotations and their horizontal gap from the line
label_y = 1150
label_gap = 0.5

# Reference marks: one dashed red line per release, each with a text label.
# The first label sits left of its line (1982 and 1985 are close together);
# every later label sits right of its line.
for index, (release_date, label) in enumerate(movie_release_dates):
    year = int(release_date.split("-")[0])
    place_left = index == 0
    p += geom_vline(xintercept=year, color="red", linetype="dashed", size=1)

    # Give each label its own one-row data so the text layer does not
    # inherit the full name data set from the plot
    label_data = {
        "year": [year - label_gap if place_left else year + label_gap],
        "Total": [label_y],
        "label": [label],
    }
    p += geom_text(
        aes(x="year", y="Total", label="label"),
        data=label_data,
        color="black",
        size=10,
        hjust=1 if place_left else 0,  # anchor the text on the side facing the line
        vjust=1,
    )

# Render the plot
p.show()